load("//:def.bzl", "gen_cpp_code")

moe_kernels_sm90 = [
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<32>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<32>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<32>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<32>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<32>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<32>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<32>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<32>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<32>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<32>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<32>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<32>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<32>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<32>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<32>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<32>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<32>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<32>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<32>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<32>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<32>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<32>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<32>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<32>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<32>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<32>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<32>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<32>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<16>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<32>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<64>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<128>, cute::Int<256>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<half, half, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<64>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<32>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<32>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<32>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<32>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<32>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<32>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<32>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<float, float, float, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<32>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::NONE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<1>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<1>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, half, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);',
    'template void sm90_generic_moe_gemm_kernelLauncher<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, tensorrt_llm::HopperGroupedGemmInput::EpilogueFusion::FINALIZE, cute::Shape<cute::Int<256>, cute::Int<128>, cute::Int<128>>, cute::Shape<cute::Int<2>, cute::Int<2>, cute::Int<1>>, false>(HopperGroupedGemmInput, int, int, cudaStream_t, int*, size_t*);'
]

moe_kernels_sm80 = [
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 16, 128, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 16, 128, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 16, 128, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 16, 256, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 16, 256, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 16, 256, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 16, 128, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 16, 128, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 16, 128, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 16, 256, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 16, 256, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 16, 256, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 16, 128, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 16, 128, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 16, 128, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 16, 256, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 16, 256, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 16, 256, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 16, 128, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 16, 128, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 16, 128, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 16, 256, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 16, 256, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 16, 256, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 32, 128, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 32, 128, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 32, 128, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 32, 128, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 32, 128, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 32, 128, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 32, 128, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 32, 128, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 32, 128, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 32, 128, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 32, 128, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 32, 128, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 64, 128, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 64, 128, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 64, 128, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 64, 128, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 64, 128, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 64, 128, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 64, 128, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 64, 128, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 64, 128, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 64, 128, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 64, 128, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 64, 128, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 128, 128, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 128, 128, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 128, 128, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 128, 128, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 128, 128, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::half_t, cutlass::half_t, 128, 128, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::half_t const* A, cutlass::half_t const* B, cutlass::half_t const* biases, bool bias_is_broadcast, cutlass::half_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 128, 128, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 128, 128, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 128, 128, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSilu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 128, 128, 64, 2, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 128, 128, 64, 3, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
    'template void sm80_generic_fused_moe_gemm_kernelLauncher<cutlass::bfloat16_t, cutlass::bfloat16_t, 128, 128, 64, 4, tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGelu>(cutlass::bfloat16_t const* A, cutlass::bfloat16_t const* B, cutlass::bfloat16_t const* biases, bool bias_is_broadcast, cutlass::bfloat16_t* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);',
]

def gen_moe_kernels():
    """Requests code generation for the SM80 and SM90 MoE GEMM instantiation lists.

    Calls `gen_cpp_code` once per architecture, in the order SM80 then SM90.
    Both calls share the same header / per-element template / tail strings,
    the same batch size (`element_per_file=4`) and the same `.cu` suffix.
    How `gen_cpp_code` turns these arguments into files is defined in
    `//:def.bzl`, not here.
    """

    # Text passed as the header argument: pulls in both launcher
    # implementations and opens the namespace the instantiations live in.
    preamble = """
#include "rtp_llm/cpp/cuda/cutlass/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.inl"
#include "rtp_llm/cpp/cuda/cutlass/cutlass_kernels/moe_gemm/launchers/moe_gemm_launcher_sm90.inl"

namespace tensorrt_llm::kernels::cutlass_kernels {
"""

    # Per-element template; `{0}` is presumably replaced by one instantiation
    # string from the list (verify against gen_cpp_code).
    instantiation_slot = """
{0}
"""

    # Text passed as the tail argument: closes the namespace opened above.
    epilogue = """
}
"""

    # (output name prefix, instantiation list) pairs, processed in order.
    targets = [
        ("moe_inst_sm80", moe_kernels_sm80),
        ("moe_inst_sm90", moe_kernels_sm90),
    ]

    for output_name, instantiations in targets:
        gen_cpp_code(
            output_name,
            [instantiations],
            preamble,
            instantiation_slot,
            epilogue,
            element_per_file=4,
            suffix=".cu",
        )
